wf <- st_read(file.path(shapeDir, "wholeFoods"))
## Reading layer `wholeFoods' from data source `/Users/kaz/Documents/Fall2020/WholeFoodsStudy/shape/wholeFoods' using driver `ESRI Shapefile'
## Simple feature collection with 2 features and 1 field
## geometry type: POINT
## dimension: XY
## bbox: xmin: -73.98821 ymin: 40.67497 xmax: -73.94837 ymax: 40.80765
## geographic CRS: WGS 84
wfHarlem <- wf %>% filter(store == "Whole Foods Harlem") %>% st_transform(2263)
wfHarlemBufferHalf <- wfHarlem %>% st_buffer(2640)
wfHarlemBufferOne <- wfHarlem %>% st_buffer(2640*2)
pluto <- st_read(file.path(shapeDir, "PLUTO_Dissolve")) %>% filter(Borough == "MN")
## Reading layer `PLUTO_Dissolve' from data source `/Users/kaz/Documents/Fall2020/WholeFoodsStudy/shape/PLUTO_Dissolve' using driver `ESRI Shapefile'
## Simple feature collection with 29922 features and 7 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: 913128.9 ymin: 120049 xmax: 1067336 ymax: 272811.2
## projected CRS: NAD83 / New York Long Island (ftUS)
tm_shape(pluto, bbox = st_bbox(wfHarlemBufferOne)) +
tm_polygons() +
tm_shape(wfHarlemBufferHalf) +
tm_borders() +
tm_shape(wfHarlemBufferOne) +
tm_borders() +
tm_shape(wfHarlem) +
tm_symbols()
plutoWF <- pluto %>% filter(st_intersects(pluto, wfHarlemBufferOne,sparse = FALSE)) %>%
select(BoroName, Block) %>%
group_by(Block) %>%
summarize(BoroName = first(BoroName)) %>%
st_cast() %>%
ungroup()
## `summarise()` ungrouping output (override with `.groups` argument)
plutoWF <- plutoWF %>%
filter(!Block %in% c(1111, 1897)) %>%
sf::st_join(wfHarlemBufferHalf %>%mutate(buffer = "Half"),join =st_within) %>%
dplyr::mutate(buffer = if_else(is.na(buffer), "Mile", buffer))
tm_shape(plutoWF) +
tm_polygons(col = "buffer")
permits <- read_csv(file.path(dataDir, "DOB_Permit_Issuance2010_2020.csv"))
## Parsed with column specification:
## cols(
## .default = col_character(),
## `Bin #` = col_double(),
## `Job #` = col_double(),
## `Community Board` = col_double(),
## `Zip Code` = col_double(),
## `Bldg Type` = col_double(),
## `Permittee's Phone #` = col_double(),
## `Act as Superintendent` = col_logical(),
## `Permittee's Other Title` = col_logical(),
## PERMIT_SI_NO = col_double(),
## LATITUDE = col_double(),
## LONGITUDE = col_double(),
## COUNCIL_DISTRICT = col_double(),
## CENSUS_TRACT = col_double()
## )
## See spec(...) for full column specifications.
## Warning: 3527 parsing failures.
## row col expected actual file
## 3392 Act as Superintendent 1/0/T/F/TRUE/FALSE Y '/Users/kaz/Documents/Fall2020/WholeFoodsStudy/data/DOB_Permit_Issuance2010_2020.csv'
## 3540 Act as Superintendent 1/0/T/F/TRUE/FALSE Y '/Users/kaz/Documents/Fall2020/WholeFoodsStudy/data/DOB_Permit_Issuance2010_2020.csv'
## 5760 Act as Superintendent 1/0/T/F/TRUE/FALSE Y '/Users/kaz/Documents/Fall2020/WholeFoodsStudy/data/DOB_Permit_Issuance2010_2020.csv'
## 7670 Act as Superintendent 1/0/T/F/TRUE/FALSE Y '/Users/kaz/Documents/Fall2020/WholeFoodsStudy/data/DOB_Permit_Issuance2010_2020.csv'
## 8141 Act as Superintendent 1/0/T/F/TRUE/FALSE Y '/Users/kaz/Documents/Fall2020/WholeFoodsStudy/data/DOB_Permit_Issuance2010_2020.csv'
## .... ..................... .................. ...... .....................................................................................
## See problems(...) for more details.
permits <- permits %>%
janitor::clean_names()%>%
select(borough, job_type,
block, lot, permit_type,
filing_date, latitude, longitude)
permits <- permits %>%
mutate(filing_date = lubridate::mdy_hms(filing_date),
year = lubridate::year(filing_date),
month = lubridate::month(filing_date))
permitsMN <- permits %>% filter(borough == "MANHATTAN")
rm(permits)
permitsWF <- permitsMN %>% mutate(Block = as.numeric(block)) %>%
left_join(plutoWF %>% st_drop_geometry()) %>%
filter(!is.na(buffer))
## Joining, by = "Block"
permitsWF %>%
group_by( month, year, buffer) %>%
summarize(date = min(filing_date),
count = n()) %>%
ggplot(aes(date, count)) +
geom_point(aes(color = buffer)) +
geom_smooth(aes(color = buffer)) +
#facet_wrap(~buffer) +
theme(legend.position = "none")
## `summarise()` regrouping output by 'month', 'year' (override with `.groups` argument)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
sales <- read_csv(file.path(dataDir, "rollingSales","nycSales_2010_2019"))
## Parsed with column specification:
## cols(
## .default = col_double(),
## NEIGHBORHOOD = col_character(),
## `BUILDING CLASS CATEGORY` = col_character(),
## `TAX CLASS AT PRESENT` = col_character(),
## `EASE-MENT` = col_logical(),
## `BUILDING CLASS AT PRESENT` = col_character(),
## ADDRESS = col_character(),
## `APARTMENT NUMBER` = col_character(),
## `BUILDING CLASS AT TIME OF SALE` = col_character(),
## `SALE DATE` = col_datetime(format = "")
## )
## See spec(...) for full column specifications.
## Warning: 1 parsing failure.
## row col expected actual file
## 310313 EASE-MENT 1/0/T/F/TRUE/FALSE E '/Users/kaz/Documents/Fall2020/WholeFoodsStudy/data/rollingSales/nycSales_2010_2019'
mnSales <- sales %>% filter(BOROUGH == 1) %>% janitor:::clean_names()
rm(sales)
# mnSalesJoin <- plutoWF %>%
# janitor:::clean_names() %>%
# left_join(mnSales, by = c("block" = "block")) %>%
# tidyr::separate(address, c("address","aptNum"), ",") %>%
# mutate(BL = paste0(block, stringr::str_pad(lot, 4,side = "left", pad = "0")),
# bbl = paste0(1, stringr::str_pad(block, 5,side = "left", pad = "0"), stringr::str_pad(lot, 4,side = "left", pad = "0")),
# aptNum = if_else(is.na(apartment_number), aptNum, apartment_number),
# time = factor(if_else(sale_date < ymd("2017-01-01"), "Before WF", "After WF"), levels = c("Before WF", "After WF")))
# library(geoclient)
#
# mnSalesJoin <- mnSalesJoin %>% filter(nchar(bbl) == 10) %>%
# mutate(bbl_use = geo_bbl(bbl)[["bbl"]])
mnSalesJoin <- readr::read_csv(file.path(dataDir, "mnSalesJoin.csv"))
## Parsed with column specification:
## cols(
## .default = col_double(),
## borough.x = col_character(),
## boro_name = col_character(),
## ct2010_y = col_character(),
## store = col_character(),
## buffer = col_character(),
## neighborhood = col_character(),
## building_class_category = col_character(),
## tax_class_at_present = col_character(),
## ease_ment = col_logical(),
## building_class_at_present = col_character(),
## address = col_character(),
## aptNum = col_character(),
## apartment_number = col_character(),
## building_class_at_time_of_sale = col_character(),
## sale_date = col_datetime(format = ""),
## geometry = col_character(),
## time = col_character()
## )
## See spec(...) for full column specifications.
mnSalesSummary <- mnSalesJoin %>%
#st_drop_geometry() %>%
filter(sale_price > 50000) %>%
mutate(month = month(sale_date),
year = year(sale_date)) %>%
group_by(year, buffer) %>%
summarize(meanSale = mean(sale_price),
medSale = median(sale_price),
count = n())
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
mnSalesSummary %>%
ggplot(aes(year, meanSale)) +
geom_point(aes(color = buffer)) +
geom_smooth(aes(color = buffer))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#facet_wrap(~buffer) +
mnSalesSummary %>%
ggplot(aes(year, medSale)) +
geom_point(aes(color = buffer)) +
geom_smooth(aes(color = buffer))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
mnSalesSummary %>%
ggplot(aes(year, count)) +
geom_point(aes(color = buffer)) +
geom_smooth(aes(color = buffer))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
repeatLots <- mnSalesJoin %>%
#st_drop_geometry() %>%
filter(sale_price > 50000) %>%
group_by(BL, address, aptNum) %>% count() %>%
filter(n > 1) %>%
pull(BL)
mnSalesRepeat <- mnSalesJoin %>% filter(BL %in% repeatLots) %>%
arrange(BL)
mnSalesRepeat %>%
#st_drop_geometry() %>%
group_by(BL, buffer, time) %>%
summarize(last= last(sale_price)) %>%
ungroup() %>%
filter(last < 10000000, last > 50000) %>%
ggplot(aes(time, last)) +
geom_point(aes(group = BL, color = time)) +
geom_line(aes(group = BL), alpha = .2) +
facet_wrap(~buffer) +
theme_minimal()
## `summarise()` regrouping output by 'BL', 'buffer' (override with `.groups` argument)
mnSalesRepeat %>%
#st_drop_geometry() %>%
group_by(BL, buffer, time) %>%
summarize(last= last(sale_price),
class = first(building_class_at_present)) %>%
ungroup() %>%
filter(last < 10000000, last > 50000, class == "R4") %>%
ggplot(aes(time, last)) +
geom_point(aes(group = BL, color = time)) +
geom_line(aes(group = BL), alpha = .2) +
facet_wrap(~buffer) +
theme_minimal()
## `summarise()` regrouping output by 'BL', 'buffer' (override with `.groups` argument)
mnSalesRepeat %>%
#st_drop_geometry() %>%
filter(building_class_at_present == "R4") %>%
group_by(BL, buffer, time) %>%
summarize(last= last(sale_price),
class = first(building_class_at_present)) %>%
ungroup() %>%
group_by(buffer,time) %>%
summarize(avgPrice = mean(last),
medPrice = median(last)) %>%
ggplot(aes(time, avgPrice)) +
geom_point(aes(group = buffer, color = buffer)) +
geom_line(aes(group = buffer), alpha = .2) +
theme_minimal()
## `summarise()` regrouping output by 'BL', 'buffer' (override with `.groups` argument)
## `summarise()` regrouping output by 'buffer' (override with `.groups` argument)
## Median change R4
mnSalesRepeat %>%
# st_drop_geometry() %>%
filter(building_class_at_present == "R4") %>%
group_by(BL, buffer, time) %>%
summarize(last= last(sale_price),
class = first(building_class_at_present)) %>%
ungroup() %>%
group_by(buffer,time) %>%
summarize(avgPrice = mean(last),
medPrice = median(last)) %>%
ggplot(aes(time, medPrice)) +
geom_point(aes(group = buffer, color = buffer)) +
geom_line(aes(group = buffer), alpha = .2) +
theme_minimal()
## `summarise()` regrouping output by 'BL', 'buffer' (override with `.groups` argument)
## `summarise()` regrouping output by 'buffer' (override with `.groups` argument)
didreg = lm(last ~ buffer * time, data = mnSalesRepeat %>%
#st_drop_geometry() %>%
filter(building_class_at_present == "R4") %>%
group_by(BL, buffer, time) %>%
summarize(last= last(sale_price),
class = first(building_class_at_present)) %>%
ungroup() %>%
mutate(buffer = if_else(buffer == "Mile", 0, 1),
time = if_else(time =="Before WF", 0, 1)))
## `summarise()` regrouping output by 'BL', 'buffer' (override with `.groups` argument)
summary(didreg)
##
## Call:
## lm(formula = last ~ buffer * time, data = mnSalesRepeat %>% filter(building_class_at_present ==
## "R4") %>% group_by(BL, buffer, time) %>% summarize(last = last(sale_price),
## class = first(building_class_at_present)) %>% ungroup() %>%
## mutate(buffer = if_else(buffer == "Mile", 0, 1), time = if_else(time ==
## "Before WF", 0, 1)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1106681 -398308 -161098 194532 12571192
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 836808 58201 14.378 < 2e-16 ***
## buffer 78660 110677 0.711 0.47759
## time 269872 98155 2.749 0.00618 **
## buffer:time -169564 166041 -1.021 0.30763
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 873000 on 513 degrees of freedom
## Multiple R-squared: 0.0159, Adjusted R-squared: 0.01014
## F-statistic: 2.762 on 3 and 513 DF, p-value: 0.04151
didreg = lm(last ~ buffer * time, data = mnSalesRepeat %>%
#st_drop_geometry() %>%
group_by(BL, buffer, time) %>%
summarize(last= last(sale_price),
class = first(building_class_at_present)) %>%
filter(last < 10000000, last > 50000) %>%
tidyr::pivot_wider(names_from = time, values_from = last) %>%
filter(!is.na(`Before WF`), !is.na(`After WF`)) %>%
tidyr::pivot_longer(names_to = "time", values_to = "last", -c(BL, buffer, class)) %>%
ungroup() %>%
mutate(buffer = if_else(buffer == "Mile", 0, 1),
time = if_else(time =="Before WF", 0, 1)))
## `summarise()` regrouping output by 'BL', 'buffer' (override with `.groups` argument)
summary(didreg)
##
## Call:
## lm(formula = last ~ buffer * time, data = mnSalesRepeat %>% group_by(BL,
## buffer, time) %>% summarize(last = last(sale_price), class = first(building_class_at_present)) %>%
## filter(last < 1e+07, last > 50000) %>% tidyr::pivot_wider(names_from = time,
## values_from = last) %>% filter(!is.na(`Before WF`), !is.na(`After WF`)) %>%
## tidyr::pivot_longer(names_to = "time", values_to = "last",
## -c(BL, buffer, class)) %>% ungroup() %>% mutate(buffer = if_else(buffer ==
## "Mile", 0, 1), time = if_else(time == "Before WF", 0, 1)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1495641 -798879 -449368 211031 8302959
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1214542 96342 12.607 < 2e-16 ***
## buffer -69930 173757 -0.402 0.68749
## time 382499 136249 2.807 0.00516 **
## buffer:time 69715 245730 0.284 0.77674
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1379000 on 588 degrees of freedom
## Multiple R-squared: 0.02139, Adjusted R-squared: 0.0164
## F-statistic: 4.284 on 3 and 588 DF, p-value: 0.00527
didreg = lm(last ~ buffer * time, data = mnSalesRepeat %>%
#st_drop_geometry() %>%
group_by(BL, buffer, time) %>%
summarize(last= last(sale_price),
class = first(building_class_at_present)) %>%
filter(last < 10000000, last > 50000) %>%
tidyr::pivot_wider(names_from = time, values_from = last) %>%
filter(!is.na(`Before WF`), !is.na(`After WF`)) %>%
tidyr::pivot_longer(names_to = "time", values_to = "last", -c(BL, buffer, class)) %>%
ungroup() %>% filter(class == "R4") %>%
mutate(buffer = if_else(buffer == "Mile", 0, 1),
time = if_else(time =="Before WF", 0, 1)))
## `summarise()` regrouping output by 'BL', 'buffer' (override with `.groups` argument)
summary(didreg)
##
## Call:
## lm(formula = last ~ buffer * time, data = mnSalesRepeat %>% group_by(BL,
## buffer, time) %>% summarize(last = last(sale_price), class = first(building_class_at_present)) %>%
## filter(last < 1e+07, last > 50000) %>% tidyr::pivot_wider(names_from = time,
## values_from = last) %>% filter(!is.na(`Before WF`), !is.na(`After WF`)) %>%
## tidyr::pivot_longer(names_to = "time", values_to = "last",
## -c(BL, buffer, class)) %>% ungroup() %>% filter(class ==
## "R4") %>% mutate(buffer = if_else(buffer == "Mile", 0, 1),
## time = if_else(time == "Before WF", 0, 1)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -944759 -441560 -148154 175690 5350241
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 851759 84251 10.110 < 2e-16 ***
## buffer 4792 149444 0.032 0.974447
## time 398000 119149 3.340 0.000962 ***
## buffer:time -32740 211346 -0.155 0.877013
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 790300 on 254 degrees of freedom
## Multiple R-squared: 0.05768, Adjusted R-squared: 0.04655
## F-statistic: 5.183 on 3 and 254 DF, p-value: 0.001716
fixed effects
library(lfe)
## Loading required package: Matrix
library(broom)
FEDF <- mnSalesRepeat %>%
# st_drop_geometry() %>%
mutate(year = year(sale_date)) %>%
group_by(BL, buffer, year) %>%
summarize(last= last(sale_price),
class = first(building_class_at_present),
time = first(time)) %>%
filter(last < 10000000, last > 50000) %>%
mutate(buffer = if_else(buffer == "Mile", 0, 1),
time = if_else(time =="Before WF", 0, 1),
did = buffer * time) %>%
ungroup()
## `summarise()` regrouping output by 'BL', 'buffer' (override with `.groups` argument)
test <- lfe::felm(last~ did | BL + year, FEDF)
summary(test)
##
## Call:
## lfe::felm(formula = last ~ did | BL + year, data = FEDF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3174888 -159237 0 139785 3767680
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## did 114965 106245 1.082 0.279
##
## Residual standard error: 743400 on 1077 degrees of freedom
## Multiple R-squared(full model): 0.8633 Adjusted R-squared: 0.7534
## Multiple R-squared(proj model): 0.001086 Adjusted R-squared: -0.8021
## F-statistic(full model):7.856 on 866 and 1077 DF, p-value: < 2.2e-16
## F-statistic(proj model): 1.171 on 1 and 1077 DF, p-value: 0.2795
harlemlots <- read_csv(file.path(dataDir, "modelling", "harlemSales_Attributes.csv"))
## Parsed with column specification:
## cols(
## .default = col_double(),
## neighborhood = col_character(),
## building_class_category = col_character(),
## tax_class_at_present = col_character(),
## building_class_at_present = col_character(),
## address = col_character(),
## building_class_at_time_of_sale = col_character(),
## sale_date = col_date(format = ""),
## BoroName = col_character()
## )
## See spec(...) for full column specifications.
harlemlots %>% group_by(year) %>% count()
## # A tibble: 10 x 2
## # Groups: year [10]
## year n
## <dbl> <int>
## 1 2010 313
## 2 2011 451
## 3 2012 619
## 4 2013 705
## 5 2014 644
## 6 2015 650
## 7 2016 620
## 8 2017 542
## 9 2018 544
## 10 2019 573
valueFormula <- sale_price ~
land_square_feet + gross_square_feet + year_built + flagshipParkDist + parkDist + subwayDist +
libDistance + popBlack/TotPop + popHisp/TotPop + avgHHsize + MedHHI + OwnerOccUnits/TotOccUnits +
OO_20k + OO20_49k + OO50_99k + OO50_99k + OO100_149k + OO150_299k + OO300_499k + OO_500_749k +
OO750_999k + OO1mil_ + permits - 1
permits <- permitsWF %>%
group_by(borough, Block, year) %>%
summarize(permits = n())
## `summarise()` regrouping output by 'borough', 'Block' (override with `.groups` argument)
harlemDF <- harlemlots%>%
filter(sale_price < 10000000, sale_price > 50000) %>%
left_join(permits %>% select(-borough)) %>%
left_join(mnSalesJoin %>% select(bbl, block, lot) %>% distinct(.), by = c("Block" = "block", "lot" = "lot"))
## Adding missing grouping variables: `borough`
## Joining, by = c("Block", "year")
harlemDFScale <- harlemDF %>%
mutate_at(vars(
land_square_feet , gross_square_feet , year_built , flagshipParkDist , parkDist , subwayDist ,
libDistance , avgHHsize , MedHHI ,OO_20k, OO20_49k, OO50_99k, OO100_149k,OO150_299k,OO300_499k,
OO_500_749k, OO750_999k, OO1mil_, permits), ~(scale(.) %>% as.vector)) %>%
mutate(sale_price = harlemDF$sale_price) %>%
filter(complete.cases(.))
harlemDFModel <- harlemDFScale %>% tidyr::nest(-year)
## Warning: All elements of `...` must be named.
## Did you want `data = c(neighborhood, building_class_category, tax_class_at_present,
## Block, lot, building_class_at_present, address, zip_code,
## residential_units, commercial_units, total_units, land_square_feet,
## gross_square_feet, year_built, tax_class_at_time_of_sale,
## building_class_at_time_of_sale, sale_price, sale_date, BoroName,
## month, lat, long, HalfMile, flagshipParkDist, parkDist, subwayDist,
## libDistance, resArea, resAreaTot, resPercent, Geo_FIPS, TotPop,
## MalePop, FemalePop, pop_5, pop5_9, pop10_14, pop15_17, pop18_24,
## pop25_34, pop35_44, pop45_54, pop55_64, pop65_74, pop75_84,
## pop85_, popWhite, popBlack, popNatAm, PopAsian, popPac, popOther,
## popTwoRace, popNotHisp, popHisp, popHispWhite, popHispBlack,
## popHispNatAm, popHispAsian, popHisPac, popHispOther, popHispTwoRace,
## avgHHsize, pop25_, lessHSDeg, Hsdeg, someCol, bachelors,
## masters, profDeg, phd, MedHHI, TotOccUnits, RenterUnits,
## OwnerOccUnits, OO_20k, OO20_49k, OO50_99k, OO100_149k, OO150_299k,
## OO300_499k, OO_500_749k, OO750_999k, OO1mil_, borough, permits,
## bbl)`?
formulaList <- list(modlog = valueFormula)
library(purrr)
formulaDF <- map2_dfr(formulaList, formulaList %>% names(), function(x,y)tibble(name = y,formula = list(x)))%>%
tidyr::pivot_wider(names_from = name, values_from = formula)
harlemDFModel <- harlemDFModel %>% rowwise() %>%
bind_cols(formulaDF) %>%
tidyr::pivot_longer(-c(year,data), names_to = "name", values_to = "formula")
modelFunc <- function(formula, data){
mod <- lm(formula, data)
}
predictFunc <- function(mod, data)data %>% mutate(predict = as.vector(predict(mod,data)),
predictexp = exp(predict))
residFunc <- function(data)data %>% mutate(residual = sale_price - predict)
library(useful)
## glmnet
xFunc <- function(formula,data) build.x(formula, data, contrasts=FALSE, sparse=TRUE)
yFunc <- function(formula,data) build.y(formula, data) %>% as.numeric()
glmnetFunc <- function(x, y){
set.seed(990)
mod <- glmnet::glmnet(x=x, y=y,
family='gaussian', standardize = FALSE)
}
predictGlmnet <- function(mod, x)tibble(predictGLMNET = as.vector(predict(mod,x,s=0.01)),
predictGLMENTexp = exp(predictGLMNET))
modList <- harlemDFModel %>% mutate(
lmmods = map2(formula,data, ~modelFunc(.x,.y)),
data = map2(lmmods, data,~predictFunc(.x,.y)),
data = map(data, ~residFunc(.x)),
x = map2(formula,data,~xFunc(.x,.y)),
y = map2(formula,data,~yFunc(.x,.y)),
glmnetMods = map2(x,y, ~glmnetFunc(.x,.y)),
dataGlmnet = map2(glmnetMods, x,~predictGlmnet(.x,.y)))
modList$data[[1]] %>% ggplot(aes(x=residual)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
harlemPLUTO <- st_read(file.path(shapeDir, "harlemPoints"))
## Reading layer `harlemPoints' from data source `/Users/kaz/Documents/Fall2020/WholeFoodsStudy/shape/harlemPoints' using driver `ESRI Shapefile'
## Simple feature collection with 9581 features and 93 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 991882.8 ymin: 224362.6 xmax: 1005913 ymax: 240429.9
## projected CRS: NAD83 / New York Long Island (ftUS)
map <- harlemPLUTO %>% inner_join(modList$data[[1]], by = c("BBL" = "bbl"))
tm_shape(wfHarlemBufferOne) +
tm_borders() +
tm_shape(wfHarlemBufferHalf) +
tm_borders() +
tm_shape(map) +
tm_symbols(col = "residual")
## Variable(s) "residual" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
allunitsDF <- harlemDFScale %>% select(c(land_square_feet, gross_square_feet, year_built, flagshipParkDist, parkDist, subwayDist,
libDistance, avgHHsize, popBlack, TotPop, popHisp, MedHHI, OwnerOccUnits, TotOccUnits,
OO_20k, OO20_49k, OO50_99k, OO100_149k,OO150_299k,OO300_499k,
OO_500_749k, OO750_999k, OO1mil_, permits, bbl)) %>% mutate(sale_price = bbl) %>% distinct(.)
modList <- modList %>% mutate(predictData = map(lmmods,~predictFunc(.x,allunitsDF)),
xall = map(formula,~xFunc(.x,allunitsDF)),
dataGlmnetall = map2(glmnetMods, xall,~predictGlmnet(.x,.y)))
test <- modList %>% select(year, predictData) %>% tidyr::unnest(predictData) %>%
select(year, bbl, predict,predictexp) %>%
bind_cols(modList %>% select(dataGlmnetall) %>% tidyr::unnest(dataGlmnetall)) %>%
left_join(mnSalesJoin %>% select(bbl, time, buffer), by = c("bbl" = "bbl")) %>%
group_by(year, buffer) %>%
summarize(meanPriceLM = mean(predict),
medPriceLM = median(predict),
meanPriceGL = mean(predictGLMNET),
medPriceGL = median(predictGLMNET))
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
testPlot <- test %>% tidyr::pivot_longer(names_to = "mods", values_to = "price", -c(year, buffer))
testPlot %>%
ggplot(aes(year, price, group = mods)) +
geom_point(aes(group = mods, color = mods, shape=buffer)) +
geom_line(aes(group = buffer, color = mods), alpha = .2 ) +
facet_wrap(~mods) +
theme_minimal()
FEDFHed <- modList %>% select(year, predictData) %>% tidyr::unnest(predictData) %>%
select(year, bbl, predict,predictexp) %>%
bind_cols(modList %>% select(dataGlmnetall) %>% tidyr::unnest(dataGlmnetall)) %>%
left_join(mnSalesJoin %>% select(bbl,buffer), by = c("bbl" = "bbl")) %>%
mutate(time = if_else(year < 2017, "Before WF", "After WF")) %>%
filter(predict < 10000000, predict > 50000,predictGLMNET < 10000000, predictGLMNET > 50000) %>%
mutate(buffer = if_else(buffer == "Mile", 0, 1),
time = if_else(time =="Before WF", 0, 1),
did = buffer * time) %>%
ungroup()
feHedLM <- lfe::felm(predict~ did | bbl + year, FEDFHed)
summary(feHedLM)
##
## Call:
## lfe::felm(formula = predict ~ did | bbl + year, data = FEDFHed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8074762 -748610 -225584 456160 7286771
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## did 596044 22232 26.81 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1480000 on 136428 degrees of freedom
## Multiple R-squared(full model): 0.4048 Adjusted R-squared: 0.399
## Multiple R-squared(proj model): 0.005241 Adjusted R-squared: -0.004486
## F-statistic(full model):69.56 on 1334 and 136428 DF, p-value: < 2.2e-16
## F-statistic(proj model): 718.8 on 1 and 136428 DF, p-value: < 2.2e-16
feHedGLM <- lfe::felm(predictGLMNET~ did | bbl + year, FEDFHed)
summary(feHedGLM)
##
## Call:
## lfe::felm(formula = predictGLMNET ~ did | bbl + year, data = FEDFHed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1843631 -293620 18218 314650 3050052
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## did 161988 7650 21.18 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 509100 on 136428 degrees of freedom
## Multiple R-squared(full model): 0.5626 Adjusted R-squared: 0.5583
## Multiple R-squared(proj model): 0.003276 Adjusted R-squared: -0.00647
## F-statistic(full model):131.6 on 1334 and 136428 DF, p-value: < 2.2e-16
## F-statistic(proj model): 448.4 on 1 and 136428 DF, p-value: < 2.2e-16
didreglm = lm(predict ~ buffer * time, data = FEDFHed)
summary(didreglm)
##
## Call:
## lm(formula = predict ~ buffer * time, data = FEDFHed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3138149 -969180 -537636 315590 8657939
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1310244 6361 205.99 <2e-16 ***
## buffer 417263 14741 28.31 <2e-16 ***
## time 1100246 12481 88.15 <2e-16 ***
## buffer:time 363139 26871 13.51 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1820000 on 137759 degrees of freedom
## Multiple R-squared: 0.09128, Adjusted R-squared: 0.09126
## F-statistic: 4613 on 3 and 137759 DF, p-value: < 2.2e-16
didregGLM = lm(predictGLMNET ~ buffer * time, data = FEDFHed)
summary(didregGLM)
##
## Call:
## lm(formula = predictGLMNET ~ buffer * time, data = FEDFHed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1349178 -557740 -118913 426619 4018316
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1354230 2586 523.75 <2e-16 ***
## buffer 162794 5992 27.17 <2e-16 ***
## time 332040 5074 65.45 <2e-16 ***
## buffer:time 221489 10923 20.28 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 739600 on 137759 degrees of freedom
## Multiple R-squared: 0.06794, Adjusted R-squared: 0.06792
## F-statistic: 3347 on 3 and 137759 DF, p-value: < 2.2e-16
FEDFHed %>% select(year, bbl, predict, time, buffer) %>%
ggplot(aes(x=year, y=predict, group = bbl)) +
geom_line(alpha = .1, aes(color = time)) +
facet_wrap(~buffer) +
theme_minimal()
FEDFHed %>% select(year, bbl, predict, time, buffer) %>%
ggplot(aes(x=year, y=predict)) +
geom_hex(bins = 10) +
facet_wrap(~buffer) +
theme_minimal()
FEDFHed %>% select(year, bbl, predict, time, buffer) %>%
ggplot(aes(x=year, y=predict)) +
geom_bin2d(bins = c(5,20)) +
facet_wrap(~buffer) +
theme_minimal()
FEDFHed %>% select(year, bbl, predictGLMNET, time, buffer) %>%
ggplot(aes(x=year, y=predictGLMNET, group = bbl)) +
geom_line(alpha = .1, aes(color = time)) +
facet_wrap(~buffer) +
theme_minimal()
FEDFHed %>% select(year, bbl, predictGLMNET, time, buffer) %>%
ggplot(aes(x=year, y=predictGLMNET)) +
geom_hex(bins = 10) +
facet_wrap(~buffer) +
theme_minimal()
FEDFHed %>% select(year, bbl, predictGLMNET, time, buffer) %>%
ggplot(aes(x=year, y=predictGLMNET)) +
geom_bin2d(bins = c(5,20)) +
facet_wrap(~buffer) +
theme_minimal()
for each distance for each year
## for each distance
distanceList <- as.list(seq(1320,4620, 660))
bufferFunction <- function(buffer, data, wf){
varName <- paste0("buffer_", buffer)
wfBuffer <- wf %>% st_buffer(buffer)
output <- data %>%
filter(!Block %in% c(1111, 1897)) %>%
sf::st_join(wfBuffer %>%mutate(!!varName := 1),join =st_within) %>%
st_drop_geometry() %>%
select(!!varName)
}
distanceBuffers <- map_dfc(distanceList, ~bufferFunction(.x, plutoWF, wfHarlem))
distanceBuffers <- distanceBuffers %>% replace(is.na(.), 0)
joinBuffer <- harlemDFScale %>% select(c(land_square_feet, gross_square_feet, year_built, flagshipParkDist, parkDist, subwayDist,
libDistance, avgHHsize, popBlack, TotPop, popHisp, MedHHI, OwnerOccUnits, TotOccUnits,
OO_20k, OO20_49k, OO50_99k, OO100_149k,OO150_299k,OO300_499k,
OO_500_749k, OO750_999k, OO1mil_, permits, Block, bbl)) %>% distinct(.) %>%
left_join(plutoWF %>% bind_cols(distanceBuffers) %>% st_drop_geometry %>% distinct(.)) %>%
select(starts_with("buffer_"))
## Joining, by = "Block"
modBuffer <- modList %>%
mutate(joinBuffer = map(year, function(x) joinBuffer)) %>%
mutate(bufferDF = map2(.x = predictData,.y = joinBuffer,.f = function(x,y) tibble(x %>% bind_cols(y))))
modBufferPlot <- modBuffer %>% select(year, bufferDF) %>% tidyr::unnest(bufferDF) %>%
select(year, bbl, predict, starts_with("buffer_")) %>%
bind_cols(modList %>% select(dataGlmnetall) %>% tidyr::unnest(dataGlmnetall)) %>%
select(-predictGLMENTexp) %>%
tidyr::pivot_longer(names_to = "bufferType", values_to = "treatment", -c(year, bbl, predict, predictGLMNET)) %>%
group_by(year, bufferType, treatment) %>%
summarize(meanPriceLM = mean(predict),
medPriceLM = median(predict),
meanPriceGL = mean(predictGLMNET),
medPriceGL = median(predictGLMNET))
## `summarise()` regrouping output by 'year', 'bufferType' (override with `.groups` argument)
modBufferPlot %>%
tidyr::pivot_longer(names_to = "mods", values_to = "price", -c(year, bufferType, treatment)) %>%
mutate(treatment = as.factor(treatment)) %>%
ggplot(aes(year, price, group = mods)) +
geom_point(aes(group = mods, color = mods, shape=treatment)) +
geom_line(aes(group = treatment, color = mods), alpha = .2 ) +
facet_grid(mods~bufferType, scales = "free_y") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
FEDBufferDF <- modBuffer %>% select(year, bufferDF) %>% tidyr::unnest(bufferDF) %>%
select(year, bbl, predict, starts_with("buffer")) %>%
bind_cols(modList %>% select(dataGlmnetall) %>% tidyr::unnest(dataGlmnetall) %>% select(predictGLMNET)) %>%
filter(predict < 10000000, predict > 50000,predictGLMNET < 10000000, predictGLMNET > 50000) %>%
ungroup()
fedDiagFunc <- function(distance, df, wfYear = 2017, FE = TRUE, mod = "LM"){
varName = paste0("buffer_", distance)
if(mod != "LM") FEDBufferDF <- FEDBufferDF %>% select(-predict) %>% rename(predict = predictGLMNET)
modDF <- FEDBufferDF %>% select(predict, !!varName, bbl, year) %>%
mutate(time = if_else(year < wfYear, 0, 1)) %>%
mutate(did = time * !!as.name(varName))
if(FE) feHedLM <- lfe::felm(predict~ did | bbl + year, modDF)
else feHedLM <- lfe::felm(predict~ did, modDF)
rmse <- sqrt(mean(feHedLM$residuals^2))
}
rmseVec <- map_dbl(distanceList, ~fedDiagFunc(.x, FEDBufferDF,wfYear = 2017, FE = TRUE, mod = "GLMNET"))
modGrid <- expand.grid(distance = seq(1320,4620, 660),
year = seq(2011,2018,1),
model = c("LM", "GLMENT"),stringsAsFactors = FALSE)
modGridList <- split(modGrid, seq(nrow(modGrid)))
rmseVec <- modGridList %>% map_dbl(~fedDiagFunc(.x$distance, FEDBufferDF, .x$year,FE = TRUE, mod=.x$model))
modGrid <- modGrid %>% mutate(rmse = rmseVec)
modGrid %>% mutate(distance = as.factor(distance)) %>%
ggplot(aes(year, rmse, group = model, color = model)) +
geom_line() +
facet_wrap(~distance) +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
modGrid %>% mutate(distance = as.factor(distance)) %>%
ggplot(aes(year, rmse, group = model, color = model)) +
geom_line() +
facet_grid(model~distance, scales = "free_y") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, hjust = 1))